A Random Forest classifier will be trained to predict whether a user remained engaged with the protest conversation. The data set consists only of users whose tweets were labeled by the text classifier, since the percent of pro-protester tweets will be one of the features.
Resources: CS109 Homework #5
In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
In [128]:
#users with classified tweets
user_tc = pd.read_pickle('final_aug_percents.pkl')
In [129]:
#count is the number of tweets with hashtag #Ferguson or #ferguson
#perc_p is the percent of the user's tweets that have been classified as 'pro-protester'
user_tc.head()
Out[129]:
In [5]:
#load august_reduced_all for data for Aug 10 - 17
df_aug = pd.read_csv('/home/data/aug_reduced_all.csv')
In [45]:
list(df_aug.columns.values)
Out[45]:
In [7]:
df_nov = pd.read_csv('/home/data/nov_reduced.csv')
In [8]:
df_nov.head()
Out[8]:
In [9]:
nov = df_nov[['user.screen_name', '_iso_created_at']]
In [10]:
nov_df = pd.DataFrame({'count': nov.groupby('user.screen_name').size()}).reset_index()
In [11]:
nov_df.head()
Out[11]:
The following features will be used:
- count: number of the user's tweets with the #Ferguson or #ferguson hashtag in August
- perc_p: percent of the user's tweets classified as pro-protester
- total_replies and pct_replies: number and percent of the user's hashtag tweets that are replies
- total_retweets and pct_retweets: number and percent of the user's hashtag tweets that are retweets
- user.friends_count: average friend count over August
- user.followers_count: average follower count over August
- user.statuses_count: maximum total tweet count over August
In [ ]:
#group by id and get average friends
fr_df = df_aug[['user.screen_name', 'user.friends_count']].dropna(how='all')
friends = fr_df.groupby(['user.screen_name'], as_index=False).mean()
In [ ]:
#group by id and get average followers
fo_df = df_aug[['user.screen_name', 'user.followers_count']].dropna(how='all')
#convert_objects is deprecated; pd.to_numeric coerces non-numeric values to NaN
fo_df['user.followers_count'] = pd.to_numeric(fo_df['user.followers_count'], errors='coerce')
followers = fo_df.groupby(['user.screen_name'], as_index=False).mean()
In [ ]:
#group by id and get max of total tweets (statuses_count is cumulative, so the max is the latest snapshot)
tt_df = df_aug[['user.screen_name', 'user.statuses_count']].dropna(how='all')
tt_df['user.statuses_count'] = pd.to_numeric(tt_df['user.statuses_count'], errors='coerce')
total_tweets = tt_df.groupby(['user.screen_name'], as_index=False).max()
In [38]:
#restrict august dataframe to users in user_tc dataframe
aug = df_aug[(df_aug['user.screen_name'].isin(user_tc['user.screen_name']))].reset_index()
#http://stackoverflow.com/questions/12096252/use-a-list-of-values-to-select-rows-from-a-pandas-dataframe
In [61]:
#get reply count for each user
for i in range(0, len(user_tc)):
    count = aug[aug['user.screen_name'] == user_tc.loc[i, 'user.screen_name']]['in_reply_to_screen_name'].count()
    user_tc.loc[i, 'total_replies'] = count
In [64]:
#get retweet count for each user
for i in range(0, len(user_tc)):
    count = aug[aug['user.screen_name'] == user_tc.loc[i, 'user.screen_name']]['retweeted_status.user.id'].count()
    user_tc.loc[i, 'total_retweets'] = count
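The two loops above rescan the full August dataframe once per user. An equivalent vectorized version (a sketch; the replies and retweets names are new here) computes the same non-null counts in one groupby pass each:
In [ ]:
#vectorized alternative: count non-null reply/retweet fields per user, then map onto user_tc
replies = aug.groupby('user.screen_name')['in_reply_to_screen_name'].count()
retweets = aug.groupby('user.screen_name')['retweeted_status.user.id'].count()
user_tc['total_replies'] = user_tc['user.screen_name'].map(replies).fillna(0)
user_tc['total_retweets'] = user_tc['user.screen_name'].map(retweets).fillna(0)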
In [69]:
#calculate %retweets and %replies for the tweets with the #F/ferguson hashtag
user_tc['pct_replies'] = user_tc['total_replies'] / user_tc['count']
user_tc['pct_retweets'] = user_tc['total_retweets'] / user_tc['count']
In [70]:
user_tc.head()
Out[70]:
In [72]:
#merge all feature dataframes together
features = (user_tc.merge(friends, on='user.screen_name', how='inner')
                   .merge(followers, on='user.screen_name', how='inner')
                   .merge(total_tweets, on='user.screen_name', how='inner'))
#code help from http://stackoverflow.com/questions/23668427/pandas-joining-multiple-dataframes-on-columns
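Inner joins silently drop any user missing from one of the feature tables, so it's worth confirming how many classified users survived the merges (a quick sanity-check sketch):
In [ ]:
#sanity check: compare row counts before and after the three inner merges
print("users with classified tweets: %d" % len(user_tc))
print("users surviving all merges: %d" % len(features))
assert not features['user.screen_name'].duplicated().any()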
In [73]:
features.head()
Out[73]:
In [74]:
#merge the august features dataframe with the november dataframe
aug_nov = pd.merge(features, nov_df, on='user.screen_name', how='left')
In [109]:
#determine if user remained engaged (1 = yes, 0 = no)
for i in range(0, len(aug_nov)):
    #users with no November tweets have NaN count_y; NaN >= 10 is False, so they get 0
    if aug_nov.loc[i, 'count_y'] >= 10:
        aug_nov.loc[i, 'rem_eng'] = 1
    else:
        aug_nov.loc[i, 'rem_eng'] = 0
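The same labels can be computed without a loop; the comparison treats NaN as False, so users absent in November get 0 just as above:
In [ ]:
#vectorized equivalent of the loop above
aug_nov['rem_eng'] = (aug_nov['count_y'] >= 10).astype(int)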
In [110]:
aug_nov.head()
Out[110]:
Calculate % remaining engaged in this sample.
In [111]:
rem_eng = len(aug_nov[aug_nov['rem_eng'] == 1])
not_rem_eng = len(aug_nov[aug_nov['rem_eng'] == 0])
print(rem_eng * 1.0 / (rem_eng + not_rem_eng))
In [112]:
#convert rem_eng to numpy array called Y
Y = np.array(aug_nov.rem_eng)
In [113]:
#drop user.screen_name
features_d = features.drop('user.screen_name', axis = 1)
features_d = features_d.fillna(0)
features_d.head()
Out[113]:
In [114]:
#convert features to matrix
X = features_d.values  #as_matrix() was removed from pandas; .values is the replacement
Features key: count, perc_p, total_replies, total_retweets, pct_replies, pct_retweets, user.friends_count, user.followers_count, user.statuses_count.
In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  #sklearn.cross_validation was removed in 0.20
#calculate cross val scores for each random forest
scores = []
for i in range(1, 31):
    rfc = RandomForestClassifier(n_estimators=i)
    score = cross_val_score(rfc, X, Y, cv=10)
    scores.append(score)
#calculate mean score for each random forest
score_means = np.mean(scores, axis=1)
trees = np.arange(1, 31)
plt.figure(figsize=(15,8))
#seaborn draws the boxes at positions 0..n-1, so align the mean scores and tick labels with them
sns.boxplot(data=scores)
plt.scatter(np.arange(30), score_means, c='k', zorder=2)
plt.xticks(np.arange(30), trees)
plt.xlabel("Number of Trees")
plt.ylabel("Cross Validation Score")
plt.title("Cross Validation Score vs. Number of Trees")
plt.show()
16 trees looks good. Now we'll try with F1 scores.
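For reference, F1 is the harmonic mean of precision and recall, F1 = 2 * (precision * recall) / (precision + recall), so unlike plain accuracy it penalizes a classifier that scores well simply by always predicting the majority class.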
In [91]:
#calculate F1 scores for each random forest
scores_f1 = []
for i in range(1, 26):
    rfc = RandomForestClassifier(n_estimators=i)
    score = cross_val_score(rfc, X, Y, cv=10, scoring='f1')
    scores_f1.append(score)
#calculate mean F1 score for each random forest
score_means_f1 = np.mean(scores_f1, axis=1)
trees = np.arange(1, 26)
plt.figure(figsize=(15,8))
sns.boxplot(data=scores_f1)
plt.scatter(np.arange(25), score_means_f1, c='k', zorder=2)
plt.xticks(np.arange(25), trees)
plt.xlabel("Number of Trees")
plt.ylabel("F1 Score")
plt.title("Cross Validation Score using F1 Scoring vs. Number of Trees")
plt.show()
16 trees still looks good, but this data set is imbalanced, so the default 0.5 probability cutoff may not be the best choice. We'll try a range of custom cutoffs.
In [92]:
def cutoff_predict(clf, X, cutoff):
    #generate prediction probabilities
    prob = clf.predict_proba(X)
    #convert probabilities to 0/1 predictions using the given cutoff
    clf_p = np.empty(len(prob))
    for i in range(len(prob)):
        if prob[i][1] > cutoff:
            clf_p[i] = 1
        else:
            clf_p[i] = 0
    return clf_p
#code from Homework #5
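As a quick illustration of how the cutoff changes the predictions (a sketch; rfc_demo is a hypothetical name, and X and Y come from the cells above):
In [ ]:
#fit a forest and compare predicted positives at a few cutoffs
rfc_demo = RandomForestClassifier(n_estimators=15)
rfc_demo.fit(X, Y)
for c in (0.5, 0.3, 0.2):
    preds = cutoff_predict(rfc_demo, X, c)
    print("cutoff %.1f -> %d users predicted to remain engaged" % (c, int(preds.sum())))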
In [93]:
import sklearn.metrics  #plain 'import sklearn' does not pull in the metrics submodule

def custom_f1(cutoff):
    #returns a scorer with the (estimator, X, y) signature that cross_val_score accepts
    def f1_cutoff(clf, X, y):
        ypred = cutoff_predict(clf, X, cutoff)
        return sklearn.metrics.f1_score(y, ypred)
    return f1_cutoff
#code from Homework #5
#set range of cutoffs
cutoff_range = np.arange(0.1, 0.9, 0.1)
#set up Random Forest Classifier
rfc_c = RandomForestClassifier(n_estimators=15)
#Calculate custom F1 scores for each random forest
scores_cc = []
for i in cutoff_range:
    score_cc = cross_val_score(rfc_c, X, Y, cv=10, scoring=custom_f1(i))
    scores_cc.append(score_cc)
plt.figure(figsize=(15,8))
sns.boxplot(data=scores_cc)
#label the boxes with their cutoff values (seaborn's old names= argument is gone)
plt.xticks(np.arange(len(cutoff_range)), np.round(cutoff_range, 1))
plt.xlabel("Cutoff Value")
plt.ylabel("F1 Score (using custom cutoff value)")
plt.title("Cross Validation Score with F1 Scoring vs. Cutoff Value")
plt.show()
In [100]:
#calculate F1 scores for each random forest, scoring with the custom 0.2 cutoff
scores_f1 = []
for i in range(1, 26):
    rfc = RandomForestClassifier(n_estimators=i)
    score = cross_val_score(rfc, X, Y, cv=10, scoring=custom_f1(0.2))
    scores_f1.append(score)
#calculate mean F1 score for each random forest
score_means_f1 = np.mean(scores_f1, axis=1)
trees = np.arange(1, 26)
plt.figure(figsize=(15,8))
sns.boxplot(data=scores_f1)
plt.scatter(np.arange(25), score_means_f1, c='k', zorder=2)
plt.xticks(np.arange(25), trees)
plt.xlabel("Number of Trees")
plt.ylabel("F1 Score")
plt.title("Cross Validation Score using Custom F1 Scoring (Cutoff = 0.2) vs. Number of Trees")
plt.show()
With the custom-cutoff F1 score, 6 trees seems optimal. The feature importances will now be generated using a 6-tree forest.
In [127]:
#set up and fit a random forest classifier
rfc_id = RandomForestClassifier(n_estimators=6)
rfc_id.fit(X, Y)
#calculate feature importances
importances = rfc_id.feature_importances_
index = np.arange(len(importances))
#modern matplotlib centers bars on their x positions, so the ticks go at index, not index + 0.5
plt.bar(index, importances)
plt.xticks(index, features_d.columns.values, rotation=75)
plt.show()
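Importances from a single 6-tree forest are high-variance; averaging over repeated fits with different seeds (a sketch reusing X, Y, and features_d from above) gives a more stable ranking:
In [ ]:
#average feature importances over 20 random seeds to reduce variance
runs = np.array([RandomForestClassifier(n_estimators=6, random_state=s).fit(X, Y).feature_importances_
                 for s in range(20)])
for name, imp in sorted(zip(features_d.columns.values, runs.mean(axis=0)), key=lambda t: -t[1]):
    print("%s: %.3f" % (name, imp))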
In [115]:
#prep dataframe for pair plots
all_aug_nov = aug_nov.drop('user.screen_name', axis = 1)
all_aug_nov = all_aug_nov.drop('count_y', axis = 1)
In [87]:
all_aug_nov.head()
Out[87]:
In [88]:
sns.pairplot(all_aug_nov, hue='rem_eng')
Out[88]: